/*

Title: 					master_clean.do
Project: 				Leveraging Curriculum Reform: An Evaluation of In-Service Teacher Training to Support Competency-Based Curriculum Reform for Entrepreneurship Teachers in Rwanda
Date Created: 			December 15, 2019 (NOTE: this is later than "Last Modified" below; the year is probably 2018 -- confirm)
Last Modified: 			February 15, 2019
Description: 			This .do file completes the Cleaning of the endline data: Student, Teacher and Headteacher surveys
Purpose:				* A) Renaming, ordering and removing non-important vars  
						* B) Checks on key aspects including outliers, strange values, consistency, duplicates etc.
						* C) Generates important analysis variables e.g. treatment etc.

================================================================================
		
*/

	
	// ---------------------------------------------------------------------
	// Session setup: empty memory, disable output paging, raise the
	// variable limit, then move to the project data folder.
	// ---------------------------------------------------------------------

	clear
	set more off
	set maxvar 30000

	// Forward slash is used as the path separator so the same line works on
	// Windows, macOS and Unix (a backslash separator is Windows-only). It also
	// matches every other path in this script.
	// Replace [directory] with the local project root before running.
	cd "[directory]/06 Data"

	// Sub-folders, relative to the working directory set above.
	global raw "Raw"
	global clean "Clean data"
	

	
	* =======================================================================
	* STUDENT TRACKING DATA
	* Prepare the tracking survey so it can be combined with the main endline
	* student data.
	* =======================================================================

	clear all
	use "$raw/Student Tracking Survey.dta", clear

	* Flag repeated studentid values. dup holds the number of surplus copies of
	* each observation's studentid (0 means the id is unique).
	* To inspect interactively: "isid studentid" or "br if dup>0".
	duplicates tag studentid, generate(dup)

	* Tracking-specific cleaning scripts, run in this order. Their contents are
	* not visible from this file; per the original note, duplicate studentids
	* are cleaned at this stage.
	foreach script in "clean tracking data.do" "readreplace_dofiletrack.do" {
		do "`script'"
	}

	* Location code corrections (district, sector, cell and village).
	* Original author's notes on the state of the data after each script:
	*   destring_location.do - location variables carry value labels, which
	*                          still need to be modified to match the main
	*                          endline data set
	*   location_values.do   - location variables hold numeric codes, but are
	*                          stored as strings
	do "destring_location.do"
	do "Location_strings.do"
	do "$raw/location_values.do"

	* Rename the location name variables before correcting and updating the
	* locations in the tracking data.
	rename (province_name district_name sector_name cell_name village_name) (province_100 district_101 sector_102 cell_103 village_104)

	* correct_loc.do corrects locations; school_labels.do assigns labels to
	* schools.
	foreach script in "correct_loc.do" "school_labels.do" {
		do "`script'"
	}

	save "$clean/Student_tracking_Final_CLEAN_14022019.dta", replace
	
		
	* =======================================================================
	* MAIN ENDLINE STUDENT SURVEY
	* =======================================================================

	clear all
	use "$raw/Student Survey Final.dta", clear

	* All cleaning of the main endline student data happens in this script.
	do "readreplace_student endline survey final.do"

	local endline_clean "$clean/Student_Survey_Final_CLEAN_14022019.dta"
	save "`endline_clean'", replace

	* The cleaned main endline data is still in memory after the save; cfvars
	* compares it against the cleaned tracking data on disk.
	cfvars "$clean/Student_tracking_Final_CLEAN_14022019.dta"
	
	
	* =======================================================================
	* APPEND: stack the cleaned tracking data under the cleaned main endline
	* data, then clean the result as one large data set.
	* =======================================================================

	clear all
	use "$clean/Student_Survey_Final_CLEAN_14022019.dta", clear
	append using "$clean/Student_tracking_Final_CLEAN_14022019.dta"

	local stacked "$clean/appended.dta"
	save "`stacked'", replace

	* Reload the appended file from disk and run its cleaning script.
	clear all
	use "`stacked'", clear
	do "cleans_appended.do"
	save "$clean/appended_final.dta", replace
	
	* =======================================================================
	* ADD GROUP TO THE APPENDED DATA SET, THEN APPEND 1895_dataset.dta
	* =======================================================================

	* Runs on the cleaned appended data still in memory from the previous save.
	* NOTE(review): presumably adds the group / treatment assignment variable;
	* confirm against group_dofile.do.
	do "group_dofile.do"

	* Compare the appended student data in memory with 1895_dataset.dta
	* (described by the original author as dataset #2 from the endline) before
	* stacking the two.
	cfvars "1895_dataset.dta"

	* Append the 1895 data set to the data in memory and save the full file.
	append using "1895_dataset.dta"
	save "$clean/appended_full.dta", replace

	* Final cleaning pass on the full appended data set.
	do "cleans_appendedfull.do"

	* NOTE(review): the active save below uses a hard-coded [directory] prefix
	* with no path separator before the file name, unlike every other save in
	* this script, which writes under $clean. The earlier $clean version is kept
	* commented out. Confirm the intended output location before switching.
	*save "$clean/appended_full clean.dta", replace
	save "[directory]appended_full clean.dta", replace
